Manuscript draft (google doc) available here: https://docs.google.com/document/d/15joiEhDGsYJTgS0y5KkjgVT_Usv1emeCA80fxc0Odkk/edit?usp=sharing


TODO

  • Site summary data

  • Biweekly summary of variables

    • Figures plus events timeline
    • Correlation & Heatmap
    • Add case counts / fix figure
  • Case Date Imputation. about that..

  • Model for cases ~ .

  • Test high vs low traffic as FE in RE model

  • Finalize fig. 2

  • Finalize fig. 3

  • Fig. 4: refresh data

  • Finish written methods and results


Analysis code
# setup ---------------------------------------------
library(tidyverse)
library(here)
library(ggiraph)
library(patchwork)
library(ggtext)
library(janitor)

ggplot2::theme_set(theme_bw() + theme(
  legend.text = element_markdown(family = 'IBM Plex Sans', colour = 'gray30'),
  axis.text = element_markdown(family = 'IBM Plex Sans', colour = 'gray30'),
  axis.title = element_markdown(family = 'IBM Plex Sans',colour = 'gray30'),
))
hrbrthemes::import_plex_sans()

source(here('R/report_plots.R'))

# Functions -----------------------------------------

convert_cq_to_copies <- function(cq) 10 ** ((40.1 - cq) / 3.23)

# creates yyyy-ww label for grouping data
get_date_week <- function(x){
  y <- lubridate::year(x)
  w <- lubridate::week(x) |> str_pad(2, 'left', 0)
  str_glue("{y}-{w}")
}

# reverses get_date_week to Weds date during week
week_to_date <- function(year_week){
  year <- str_extract(year_week, '^....')
  d <- str_glue('{year}-01-01') |> as_date()
  wk <- str_extract(year_week, '..$') |> as.integer()
  ydays <- if_else(
    year == '2021',
    days(7*wk - 2),
    days(7*wk - 3),
  )
  return(d + ydays)
}


# get biweekly bin date for vector of dates
get_date_biweekly <- function(x){
  day_of_week <-  lubridate::wday(x)
  case_when(
    day_of_week %in% 1:3 ~ x + 3 - day_of_week,
    TRUE ~ x + 5 - day_of_week,
  )
}

# dates 2021-2022 with biweekly groups
biweekly_date_sequence <- function(){
  tibble(
    date = seq(
      as_date('2021-01-01'), 
      as_date('2022-12-31'), 
      by = 1)
  ) |> 
    mutate(biweek = get_date_biweekly(date))
}

# lookup table for date
week_midpoint_date_lookup <- function(start = '2021-01-01', end = '2023-01-01'){
  ends = lubridate::as_date(c(start, end))
  tibble(
    date = seq(ends[1], ends[2], by = 1),
    date_week = lubridate::week(date) |> str_pad(2, 'left', '0'),
    date_year = lubridate::year(date),
    week = str_glue("{date_year}-{date_week}"),
  ) |> 
    group_by(week) |> 
    summarise(date = mean(date))
}

# convert site1/site2 to hi/low traffic
get_traffic_level <- function(location){
  if_else(
    str_detect(location, 'Site 1'), 
    'high traffic', 
    'low traffic'
  )
} 

binom_ci <- function(x, n){
  Hmisc::binconf(x, n, method = 'wilson', alpha = 0.05) |> 
    as_tibble() |> 
    janitor::clean_names()
}

get_binom_ci <- function(data){
  data |> 
    summarise(
      x = sum(pcr_positive == 'Positive'),
      n = n(),
      binconf = map2(x, n, ~binom_ci(.x, .y))
    ) |> 
    unnest(binconf) |> 
    mutate(across(c(point_est, lower, upper), ~(.x*100) |> round(1)))
}

# Plot Elements ---------------------------------------

theme_color <- 'cornflowerblue'

# layout x-axis
scale_x_study_dates <- function(){
    scale_x_date(
      date_breaks = 'month',
      date_labels = month.name[c(9:12, 1:5)] |> strtrim(3) |> 
        paste0(' ', c(rep(2021, 4), rep(2022, 5))),
      limits = c(
        min(swabs_tidy$date_swab), 
        max(swabs_tidy$date_swab)
      ))
}

# get rid of x-axis for vertically stacked plots
theme_no_x_labels <- function(){
  theme(
    axis.ticks.x = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_blank()
  )
}

# get rid of y-axis for horizontally stacked plots
theme_no_y_labels <- function(){
  theme(
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank(),
    axis.text.y = element_blank()
  )
}

# no minor grid and adjust spacing for multipanel
theme_remove_grid <- function(){
  theme(
    axis.title.y = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title.position = 'panel', 
    plot.title = element_markdown(
      vjust = -1, 
      size = 8, 
      face = 'bold', 
      family = 'IBM Plex Sans',
      colour = 'gray50',
      margin = margin(0, 0, 0, 0)),
    plot.margin = unit(c(0, 0, 0, 0), 'mm'))
}

# Data ------------------------------------------------

# swab data
swabs <- 
  read_rds(here('data/cube.rds')) |> 
  filter(str_detect(site, '^UO_')) 

# filtered swabs without controls or sponge samples
swabs_tidy <- swabs |> 
  filter(!negative_control, 
         swab_type != 'sponge',
         date_swab < '2022-04-27') |> 
  select(date_swab, site, floor, location, pcr_positive, pcr_ct, co2) |> 
  mutate(biweek = get_date_biweekly(date_swab))

# lookup table for waste water site names
lookup_ww <- tribble(
  ~site_abbr, ~site_name,
  'UO_AA',  'Annex Residence',
  'UO_FA',  'Faculty of Social Sciences',
  'UO_FT',  'Friel Residence',
  'UO_NA',  'Northern sampling site',
  'UO_RGN', 'Roger Guindon Hall',
  'UO_ST',  'Southern sampling site',
  'UO_TBT', 'Tabaret Hall'
)


# UOttawa waste water data from R. Delatolla to clean
clean_ww_excel <- function(){
  readxl::read_excel(here::here('data/ww_university.xlsx')) |> 
    janitor::clean_names() |> 
    transmute(
      sample_date = as_date(sample_date),
      site = source_name,
      signal = viral_copies_per_copies_pep_avg
    ) |> 
    mutate(
      site = str_remove_all(
        site, 'Data\\s-\\suOttawa\\s-\\s|.xlsx'
      ) |> 
        str_to_upper()
    ) |>
    filter(site != 'UO_RGN', !is.na(signal)) 
}

clean_ww_excel() |> 
  write_csv(here::here('data/ww_university_clean.csv'))

rm(clean_ww_excel)

# clean WW data close to study period, no RGN site.
uottawa_ww <-
  read_csv(here::here('data/ww_university_clean.csv'),
           show_col_types = FALSE) |> 
  filter(
    sample_date > '2021-09-15',
    sample_date < '2022-05-05'
  ) |>
  left_join(lookup_ww, by = c('site' = 'site_abbr')) 

# ottawa wastewater data: daily means
regional_ww <- 
  read_rds(here('data/ww_ottawa.rds')) |> 
  select(sample_date, starts_with('cov_')) |> 
  pivot_longer(contains('cov_')) |> 
  mutate(target = str_extract(name, 'cov_n.'),
         stat = str_extract(name, 'mean|sd'),
         ) |> 
  select(-name) |> 
  pivot_wider(names_from = stat, values_from = value) |> 
  mutate(.lower = mean - sd, .upper = mean + sd)   

# data from university of ottawa
wifi <- 
  read_rds(here('data/uo_wifi.rds')) |> 
  filter(date >= min(swabs$date_swab), 
         date <= max(swabs$date_swab))

# important dates for context
event_dates <- 
  tribble(
    ~event, ~start, ~end,
    'Reading Week',  '2021-10-24', '2021-10-30',
    'Holiday Break', '2021-12-22', '2022-01-04',
    'Closure',       '2022-01-04', '2022-01-31',
    'Closure',       '2022-02-16', '2022-02-21',
    'Reading Week',  '2022-02-20', '2022-02-26',
  ) |> 
  mutate(across(start:end, as_date))

# UOttawa cases data
# clean data from: source(here('R/01_impute_missing_case_dates.R'))
cases <-
  read_rds(here('data/cases_rule_imputed.rds')) |> 
  select(case, role, case_date, UC:TBT) |>
  mutate(biweek = get_date_biweekly(case_date)) |> 
  nest(associated_sites = UC:TBT)


total_n_cases_study_period <- 
  cases |> 
  filter(case_date > '2021-09-20') |> 
  nrow()


# Summaries -----------------------------------------
# 
positivity_summary <- function(swabs){
  swabs |> 
    summarise(
      n = n(),
      x = sum(pcr_positive == 'Positive', na.rm = F),
      positivity = round(100 * x / n, digits = 1),
      .groups = 'drop')
}

co2_summary <- function(swabs){
  swabs |> 
    summarise(
      n = n(),
      co2_vals = list(co2),
      co2_mean = mean(co2, na.rm = T),
      .groups = 'drop')
}
  
# overall summary
swab_summary <- swabs_tidy |> positivity_summary()

# site summary
swab_summary_sites <- swabs_tidy |> 
  group_by(site) |> 
  positivity_summary()
  
# high-low traffic summary
swab_summary_location_traffic <- 
  swabs_tidy |> 
  mutate(traffic = get_traffic_level(location)) |> 
  group_by(traffic) |> 
  positivity_summary()

## Figure 1: data aggregation --------------------------

# uottawa cases - biweekly counts 
cases_biweekly <- 
  cases |> 
  select(case, biweek, role) |> 
  group_by(biweek) |> 
  summarise(cases = n(),
            cases_student = sum(role == 'Student'),
            cases_staff = sum(role == 'Employee'),
            ) |> 
  right_join(
    biweekly_date_sequence() |> 
      filter(biweek < '2022-02-03') |> 
      distinct(biweek),
    by = 'biweek'
  ) |> 
  mutate(across(where(is.integer), ~if_else(is.na(.), 0L, .))) 

# swabs biweekly summary
swabs_biweekly <- 
  swabs_tidy |>
  group_by(biweek) |> 
  positivity_summary() |> 
  left_join(swabs_tidy |> group_by(biweek) |> co2_summary(),
            by = join_by(biweek, n))

# swabs biweekly summary by site  
swabs_biweekly_by_site <- 
  swabs_tidy |>
  group_by(site, biweek) |> 
  positivity_summary() |> 
  left_join(
    swabs_tidy |> group_by(site, biweek) |> co2_summary(),
    by = join_by(site, biweek, n)
  )

# UOttawa ww biweekly means
uottawa_ww_biweekly <- 
  uottawa_ww |> 
  mutate(biweek = get_date_biweekly(sample_date)) |> 
  group_by(biweek) |> 
  summarise(signal = mean(signal),
            n = n())

# daily ww data for study period
regional_ww_daily <- 
  regional_ww |> 
  filter(
    sample_date >= min(swabs_tidy$date_swab) - 4,
    sample_date <= max(swabs_tidy$date_swab) + 4,
  ) |> 
  group_by(sample_date) |> 
  summarise(mean = mean(mean))

# regional ww biweekly means
regional_ww_biweekly <- 
  regional_ww_daily |> 
  mutate(biweek = get_date_biweekly(sample_date)) |> 
  group_by(biweek) |> 
  summarise(mean = mean(mean))

# wifi site agg
wifi_extended <- read_rds(here('data/uo_wifi.rds')) |> 
  filter(date >= min(swabs$date_swab) - 2, 
         date <= max(swabs$date_swab) + 2) |> 
  mutate(biweek = get_date_biweekly(date))

# overall wifi biweekly timeseries
wifi_biweekly <- 
  wifi_extended |> 
  group_by(biweek) |> 
  summarise(
    n = n(),
    min = min(clients, na.rm = T),
    mean = mean(clients, na.rm = T),
    max = max(clients, na.rm = T)
  )
# per building wifi biweekly timeseries
wifi_biweek_sites <- 
  wifi_extended |> 
  group_by(biweek, site) |> 
  summarise(
    n = n(),
    min = min(clients),
    mean = mean(clients),
    max = max(clients),
    .groups = 'drop'
  )

# Map --------------------------------------------------

swab_coords <- tribble(
  ~site, ~name, ~lat, ~lng,
  'MRT', 'Morrisette Hall (MRT)', 45.4232391, -75.684289,
  'MNT', 'Montpetit', 45.4225417102, -75.6826587146,
  '90U', '90 University', 45.422425557, -75.68501449,
  'DMS', 'Desmarais Bldg.', 45.4238767934, -75.687270754,
  'TBT', 'Tabaret Hall', 45.4245094016, -75.6863190018,
  'LPR',  'Louis Pasteur Bldg.', 45.42137566861, -75.6802638999,
)

ww_coords <- tribble(
  ~site, ~name, ~lat, ~lng,
  'aa', 'Annex Residence', 45.4267812902, -75.6803440596,
  'fa', 'Faculty of Soc. Sci.', 45.4216247, -75.6839038601,
  'ft', 'Friel Residence', 45.4304344008, -75.6829076653,
  'tbt', 'Tabaret Hall', 45.4245094016, -75.6863190018
  # 'na', 'North sampling Site', NA, NA,
  # 'st', 'South Sampling Site', NA, NA, # Nobody knows where this is
)

figure_sites_map <- 
  leaflet::leaflet(swab_coords, width = 600, height = 330) |> 
  leaflet::addProviderTiles(
    leaflet::providers$CartoDB.Positron
  ) |> 
  leaflet::setView(lng = -75.6843, lat = 45.4235, zoom = 14) |> 
  leaflet::addCircleMarkers(
    ~lng, ~lat, 
    label = ~name, 
    popup = ~name, 
    radius = 2, 
    color = '#440099'
    ) |> 
  leaflet::addLabelOnlyMarkers(
    ~lng, ~lat, 
    label =  ~site, 
    labelOptions = leaflet::labelOptions(
      noHide = T, direction = 'top', textOnly = T, 
    )) |> 
  leaflet::addCircleMarkers(
    ~lng, 
    ~lat,
    label = ~site, 
    radius = 2,
    color = '#440099'
    ) |> 
  leaflet::addCircleMarkers(
    data = ww_coords, 
    ~lng,
    ~lat, 
    label =  ~name, 
    popup =  ~name, 
    radius = 4, 
    color = '#f96999'
    )


# Results ----------------------------------------------------

rs_data <- lst(
  swabs_collected = nrow(swabs_tidy),
  positivity_ovr = swab_summary$positivity,
  positivity_site_min = min(swab_summary_sites$positivity),
  positivity_site_max = max(swab_summary_sites$positivity),
)

rs_abstract <- rs_data |> 
  glue::glue_data(
    "Over the 32-week study period, we collected {swabs_collected} swabs at six university buildings, including two buildings where wastewater surveillance was currently ongoing. 
Overall, {positivity_ovr}% of swabs were PCR-positive for SARS-CoV-2, with individual buildings ranging between {positivity_site_min}% and {positivity_site_max}%.
*Comment on when spikes in cases, swabs, and waste-water occurred….* *Comment on prediction of cases using swabs, ww, or combined approach…*
"
  )

# rs_results <- rs_data |> glue::glue_data()


## corr -----

# joined data for pairwise correlation 
biweekly_data <- swabs_biweekly |> 
  transmute(biweek, `Swab PCR` = positivity, `CO2` = co2_mean) |> 
  left_join(
    wifi_biweekly |> transmute(biweek, `Wi-Fi` = mean),
    by = 'biweek'
    ) |> 
  left_join(
    uottawa_ww_biweekly |> transmute(biweek, `University WW` = signal),
    by = 'biweek'
  ) |> 
  left_join(
    regional_ww_biweekly |> transmute(biweek, `Ottawa WW` = mean),
    by = 'biweek'
  ) |> 
  left_join(
    cases_biweekly |> transmute(biweek, Cases = cases),
    by = 'biweek'
  )

corr_spearman <- biweekly_data |> 
  select(-biweek) |> 
  corrr::correlate(use = 'pairwise.complete.obs',
                   method = 'spearman')

# to get pvalues...
corrtest <- 
  biweekly_data |> 
  select(-biweek) |> 
  psych::corr.test(
    use = 'pairwise',
    method = 'spearman',
    adjust = 'none' # See p.adjust for "holm" > "bonferroni").
  )

# spearman r and p-values; upper triangle has adj. correlations
corr_r <- corrtest$r |> round(3)
corr_p <- corrtest$p |> round(3)

# remove lower tri with raw probabilities
corr_r[lower.tri(corr_r, diag = T)] <- NA
corr_p[lower.tri(corr_p, diag = T)] <- NA

corr_tbl_r <- kableExtra::kable(
  x = corr_r,
  format = 'pipe',
  caption = 'Campus-wide, biweekly metrics: Spearman r'
)
corr_tbl_p <- kableExtra::kable(
  x = corr_p,
  format = 'pipe',
  caption = 'Campus-wide, biweekly metrics: p-value on Spearman r'
)

corr_ci <- corrtest |> 
  pluck('ci') |>
  as_tibble() |> 
  mutate(terms = rownames(corrtest$ci), .before = 1L)

rm(corr_r, corr_p, corrtest)
Expand for plotting details
# fig1 -----
fig1_timeline <- 
  event_dates |> 
  ggplot(aes(x = start, xend = end, y = event, yend = event)) +
  geom_segment(linewidth = 4, lineend = 'round',
               color = theme_color,
               alpha = 0.3) +
  geom_segment(linewidth = 1, lineend = 'butt',
             color = theme_color,
             alpha = 0.9) +
  geom_point(aes(x = end), color = theme_color, size = 1.6) +
  geom_point(color = theme_color, size = 1.6) +
  scale_x_study_dates() +
  labs(title = 'Timeline') +
  theme_no_x_labels() +
  theme_remove_grid()

fig1_cases <-
  cases_biweekly |>
  # adjust bar widths manually
  mutate(biweek = if_else(wday(biweek) == 5, biweek + 0.75, biweek - 0.75)) |>
  select(biweek, cases_student, cases_staff) |>
  pivot_longer(-biweek) |>
  mutate(name = str_remove(name, 'cases_') |> str_to_title()) |>
  ggplot() +
  geom_vline(
    data = tibble(x = as_date('2022-02-03')),
    aes(xintercept = x),
    lty = 2,
    color = 'gray'
  ) +
  geom_text(
    data = tibble(
      x = as_date('2022-02-05'),
      y = 18,
      label = 'End of data'
    ),
    aes(x, y, label = label),
    size = 2,
    hjust = 0
  ) +
  geom_col(
    aes(biweek, value, fill = name),
    size = 0.15,
    width = 3.5,
    color = 'gray80',
    position = position_stack()
  ) +
  labs(fill = NULL, title = 'UOttawa COVID-19 Cases') +
  scale_fill_carto_d() +
  scale_x_study_dates() +
  theme_no_x_labels() +
  theme_remove_grid() +
  theme(
    legend.position = c(0.9, 0.4),
    legend.key.size = unit(2, 'mm'),
    legend.background = element_rect(color = 'grey80')
  )

fig1_swabs <- 
  swabs_biweekly |> 
  ggplot(aes(biweek, positivity)) +
  geom_smooth(
    linewidth = 0.5, 
    alpha = 0.2, 
    fill = theme_color, 
    span = 0.4,
    method = 'loess',
    formula = 'y ~ x', 
    na.rm = T) +
  geom_point(color = theme_color, alpha = 0.85, shape = 1, size = 1) +
  scale_x_study_dates() +
  labs(title = 'Swab Positivity (%)') +
  theme_no_x_labels() +
  theme_remove_grid()
  
fig1_co2 <-
  ggplot(swabs_biweekly, aes(biweek, co2_mean)) +
  geom_point(color = theme_color, shape = 1, alpha = 0.85, size = 1) +
  geom_smooth(fill = theme_color, alpha = 0.2, size = 0.5) +
  labs(x = NULL, y = NULL, title = 'CO~2~ (ppm)') +
  scale_x_study_dates() +
  theme_no_x_labels() +
  theme_remove_grid()
  
fig1_wifi <-
  wifi_biweekly |> 
  ggplot(aes(biweek, mean)) +
  geom_smooth(span = 0.5, 
              size = 0.5,
              alpha = 0.2, 
              fill = theme_color) +
  geom_point(color = theme_color,  alpha = 0.85, shape = 1, size = 1) +
  labs(title = "Wi-Fi Connections") +
  scale_x_study_dates() +
  theme_no_x_labels() +
  theme_remove_grid()

fig1_uo_ww <-
  ggplot(uottawa_ww_biweekly,
         aes(biweek, signal)) +
  geom_smooth(span = 0.4, alpha = 0.2, size = 0.5,
              fill = theme_color, na.rm = T) +
  geom_point(
    # aes(size = n),
    alpha = 0.85, shape = 1, size = 1,
    color = theme_color,
    na.rm = T,
    show.legend = F) +
  labs(
    title = 'University Waste-Water',
  ) +
  scale_x_study_dates() +
  scale_size(range = c(1, 3)) +
  theme_no_x_labels() +
  theme_remove_grid()

fig1_ott_ww <- 
  ggplot(regional_ww_daily) + 
  geom_smooth(aes(sample_date, mean), 
              na.rm = T,
              method = 'loess', formula = 'y ~ x',
              span = 0.2, size = 0.5, alpha = 0.2,
              fill = theme_color) +
  geom_point(data = regional_ww_biweekly,
             aes(biweek, mean),
             na.rm = T,
             alpha = 0.85, shape = 1, size = 1,
             color = theme_color) +
  labs(x = 'Date',
       title = 'Regional Waste-Water',
       ) +
  scale_x_study_dates() +
  theme_remove_grid() +
  theme(axis.title.x = ggtext::element_markdown(lineheight = 1),
        axis.text.x = ggtext::element_markdown(size = 6.7)) 

figure_1 <- wrap_plots(
    fig1_timeline, fig1_cases, fig1_swabs, fig1_co2, 
    fig1_wifi, fig1_uo_ww, fig1_ott_ww
  ) +
  plot_layout(ncol = 1, nrow = 7, heights = c(1, rep(1.5, 6)))

rm(fig1_timeline, fig1_swabs, fig1_co2, fig1_wifi, fig1_uo_ww, fig1_ott_ww, fig1_cases)



## Figure 2: Corr heatmap -----------------------------------

figure_2 <- corrr::autoplot(corr_spearman, triangular = "lower",
                             barheight = 10) +
  geom_text(aes(label = r |> round(2)), size = 3.5)
  

## Figure 3: Per-building res ---------------------------------

#### thematic elements and axis scales -----

# limits are based on the min/max of data to keep scales consistent across all plots of a given variable
scale_y_co2 <- function() scale_y_continuous(limits = c(400, 950))
scale_y_copies <- function() scale_y_log10(limits = c(1, 300))
scale_y_cases <- function() {
  scale_y_continuous(breaks = seq(0, 5, 2), limits = c(0,5))
}
scale_y_wifi <- function() scale_y_continuous(limits = c(0, 1000))

scale_color_traffic <- function(){
  scale_color_carto_d(
    palette = 2,
    breaks = c('High', 'Low'),
    labels = c('High', 'Low'),
    drop = F
  )
}

theme_axes_lb <- function(){
  theme_remove_grid() +
  theme(
    panel.border = element_blank(),
    axis.line = element_line(colour = "gray30", linewidth = rel(0.5)),
    plot.margin = margin(2,2,2,2, 'mm')
  )
}

remove_y_axis_if_not_leftmost <- function(p, site){
  if (site %in% c('DMS', 'MRT')) return(p)
  p + theme_no_y_labels()
}

#### individual plot functions -----
plot_site_cases <- function(data, site){
  p <- data |>
    ggplot(aes(biweek, cases, 
               fill = factor(role, levels = c('Employee', 'Student')))
           ) +
    geom_col(linewidth = .1, width = 2.5, color = 'grey70') +
    geom_vline(data = tibble(xint = as_date('2022-02-02')),
               aes(xintercept = xint), lty = 2, alpha = 0.35) +
    scale_x_study_dates() +
    scale_y_cases() +
    scale_fill_carto_d(
      palette = 1,  
      breaks = c('Employee', 'Student'),
      labels = c('Employee', 'Student'),
      drop = FALSE,
      name = 'Role') +
    labs(fill = 'Cases', y = 'Cases', x = NULL, title = site) +
    theme_no_x_labels() +
    theme_axes_lb() +
    theme(axis.title.y = element_markdown()) 
  remove_y_axis_if_not_leftmost(p, site)
}

# swabs
plot_site_swabs <- function(data, site){
  data_early <- data |> filter(date_swab < '2022-01-01')
  data_late <- data |> filter(date_swab > '2022-01-01')
  
  p <- data_early |>
    ggplot(aes(date_swab, copies_plus_one, color = traffic)) +
    geom_path(alpha = 0.8, linewidth = 0.6) +
    geom_point(size = 0.6, alpha = 0.8) +
    geom_path(data = data_late, alpha = 0.8, linewidth = 0.6) +
    geom_point(data = data_late, size = 0.6, alpha = 0.8) +
    scale_x_study_dates() +
    scale_y_copies() +
    scale_color_traffic() +
    labs(x = NULL, y = 'Copies + 1', color = 'Traffic Level') +
    theme_no_x_labels() +
    theme_axes_lb() +
    theme(axis.title.y = element_markdown(),
          legend.text = element_markdown())
  remove_y_axis_if_not_leftmost(p, site)
}

# co2
plot_site_co2 <- function(data, site){
  data_early <- data |> filter(date_swab < '2022-01-01')
  data_late <- data |> filter(date_swab > '2022-01-01')
  
  p <- data_early |> 
    ggplot(aes(date_swab, co2, color = traffic)) +
    geom_path(alpha = 0.8) +
    geom_point(alpha = 0.8, size = 0.6) +
    geom_path(data = data_late, alpha = 0.8) +
    geom_point(data = data_late, alpha = 0.8, size = 0.6) +
    scale_x_study_dates() +
    scale_y_co2() +
    scale_color_traffic() +
    labs(y = 'CO~2~', x = NULL) +
    theme_no_x_labels() +
    theme_axes_lb() +
    theme(axis.title.y = element_markdown(), legend.position = 'none')
  remove_y_axis_if_not_leftmost(p, site)
}

# wifi
plot_site_wifi <- function(data, site){
  if (is.null(data)) return(patchwork::plot_spacer())
  
  p <- data |> 
    ggplot(aes(date, clients)) +
    geom_path(alpha = 0.5) +
    scale_x_study_dates() +
    scale_y_wifi() +
    labs(y = 'Wifi', x = NULL) +
    theme_axes_lb() +
    theme(
      axis.title.y = element_markdown(),
      axis.text.x = element_markdown(size = 6))
  remove_y_axis_if_not_leftmost(p, site)
}

## Plot datasets ====
fig3_cases <- 
  cases |> 
  unnest(associated_sites) |> 
  pivot_longer(UC:TBT, names_to = 'site') |> 
  mutate(site = if_else(site == 'UC', '90U', site)) |> 
  filter(value, 
         case_date > min(swabs_tidy$date_swab - 5),
         case_date < max(swabs_tidy$date_swab + 5),
         ) |> 
  group_by(site, biweek, role) |> 
  summarise(cases = n(),
            dates = list(case_date),
            .groups = 'drop') |>
  group_nest(site, .key = "cases")

fig3_swabs <- 
  swabs_tidy |> 
  mutate(
    site = str_remove(site, 'UO_'),
    traffic = get_traffic_level(location),
    copies = convert_cq_to_copies(pcr_ct),
    copies_plus_one = if_else(is.na(copies), 1, copies + 1),
    ) |> 
  select(site, traffic, date_swab, copies_plus_one) |>
  group_nest(site, .key = "swabs")

fig3_wifi <- 
  wifi |> 
  mutate(site = str_remove(site, 'UO_')) |> 
  select(date, site, clients) |>
  group_nest(site, .key = "wifi")

fig3_co2 <- 
   swabs_tidy |> 
  transmute(
    site = str_remove(site, 'UO_'),
    traffic = get_traffic_level(location),
    date_swab,
    co2,
    ) |> 
  group_nest(site, .key = "co2") |> 
  print()
## # A tibble: 6 × 2
##   site                 co2
##   <chr> <list<tibble[,3]>>
## 1 90U             [98 × 3]
## 2 DMS             [86 × 3]
## 3 LPR             [98 × 3]
## 4 MNT             [98 × 3]
## 5 MRT             [90 × 3]
## 6 TBT             [84 × 3]
#### data handling and making building panels -----

# combine nested df
fig3_df_nest <- fig3_cases |> 
  left_join(fig3_swabs, by = join_by(site)) |> 
  left_join(fig3_wifi, by = join_by(site)) |> 
  left_join(fig3_co2, by = join_by(site))

# map data to plots, combine into panel for each site
fig3_plts <- fig3_df_nest |>
  arrange(site == '90U') |> 
  transmute(
    site,
    plt_cases = map2(cases, site, plot_site_cases),
    plt_swabs = map2(swabs, site, plot_site_swabs),
    plt_co2 = map2(co2, site, plot_site_co2),
    plt_wifi = map2(wifi, site, plot_site_wifi),
  ) |> 
  mutate(panel = pmap(
    .l = list(site, plt_cases, plt_swabs, plt_co2, plt_wifi),
    .f = function(site,plt_cases, plt_swabs, plt_co2, plt_wifi) {
      p <- 
        patchwork::wrap_plots(
          plt_cases, plt_swabs, plt_co2, plt_wifi, ncol = 1
        ) 
      
      if (site %in% c('90U', 'MNT')) return(p)
      p + patchwork::plot_annotation(theme = theme(
        axis.title.y = element_blank(),
        axis.text.y = element_blank()
      ))
    }))

#### Combine all 6 sites' plots into 2*3 panel -----

figure_3 <- 
  patchwork::wrap_plots(
    fig3_plts$panel, 
    tag_level = 'keep',
    guides = 'collect'
  )

Results

Abstract Results

cat(rs_abstract)

Over the 32-week study period, we collected 554 swabs at six university buildings, including two buildings where wastewater surveillance was currently ongoing. Overall, 13% of swabs were PCR-positive for SARS-CoV-2, with individual buildings ranging between 4.8% and 32.7%. Comment on when spikes in cases, swabs, and waste-water occurred…. Comment on prediction of cases using swabs, ww, or combined approach…


Tables

Summary table

# stack summaries
swab_summary |> 
  mutate(site = 'Overall') |> 
  bind_rows(swab_summary_sites) |> 
  relocate(site) |> 
  set_names(c('Site', 'N', 'PCR-Positive', '%')) |> 
  kableExtra::kable(
    format = 'pipe',
    caption = 'Building Summary'
  )
Building Summary
Site N PCR-Positive %
Overall 554 72 13.0
UO_90U 98 32 32.7
UO_DMS 86 5 5.8
UO_MRT 90 13 14.4
UO_MNT 98 9 9.2
UO_TBT 84 4 4.8
UO_LPR 98 9 9.2

Spearman correlation between campus-wide measures

kableExtra::kable(
  x = corr_ci, 
  format = 'pipe',
  caption = 'Spearman r for biweekly, campus-wide measures'
  )
Spearman r for biweekly, campus-wide measures
terms lower r upper p
SwPCR-CO2 -0.4210420 -0.1586261 0.1282930 0.2763179
SwPCR-Wi-Fi -0.4983910 -0.2432931 0.0505771 0.1032564
SwPCR-UnvWW 0.3157817 0.5528179 0.7249053 0.0000559
SwPCR-OttWW 0.4973249 0.6830058 0.8088547 0.0000001
SwPCR-Cases 0.5282461 0.7434316 0.8688837 0.0000017
CO2-Wi-Fi -0.2225151 0.0724638 0.3552879 0.6322392
CO2-UnvWW -0.3284296 -0.0455597 0.2448100 0.7610603
CO2-OttWW -0.4365919 -0.1771429 0.1095086 0.2233583
CO2-Cases -0.5112380 -0.1916081 0.1745854 0.3017934
Wi-Fi-UnvWW -0.5260623 -0.2715997 0.0274979 0.0745187
Wi-Fi-OttWW -0.5266926 -0.2790626 0.0122259 0.0603663
Wi-Fi-Cases -0.6351605 -0.3564595 0.0043710 0.0531745
UnvWW-OttWW 0.3810857 0.6023358 0.7583331 0.0000075
UnvWW-Cases 0.0818404 0.4294475 0.6839052 0.0178699
OttWW-Cases 0.1761573 0.4993295 0.7253344 0.0042398

Figures

(figure_1) 

Figure 1: Time-series of (top to bottom) important events at the university, proportion of PCR-positive swabs, biweekly mean ambient CO2 (across collection sites), biweekly mean peak number of Wi-Fi connections (across 5 buildings), biweekly mean waste-water signal (across up to 6 collection sites; relative to PPMoV), biweekly mean regional waste-water detection (relative to PPMoV). Points show biweekly means. Trend lines show the LOESS fit.



figure_2

rm(corr_spearman)

Figure 2. Spearman correlation between biweekly campus-wide variables: self-reported cases, floor swab positivity (Swab PCR), mean CO2, mean daily peak Wi-Fi connections, aggregate waste-water signal at the university (University WW) and regional level (Ottawa WW),



figure_3

Figure 3. Campus COVID-19 cases, swab results, CO2 concentrations (during swab collection), and daily peak Wi-Fi connections by building. Cases plots show counts of self-reported for students and employees separately; several cases are associated with multiple buildings. Case data collection was abandoned in early February 2022 (dashed line). Swabs and CO plots show results at two locations within each building, with one sample collected in a high-traffic area and the other in a low-traffic area. Swab PCR results are expressed as the number of SARS-CoV-2 RNA copies plus one, on a log-scale. Points represent the result for a single swab. Wi-fi plots show the peak daily number of simultaneous connections per building. No Wi-Fi data was available for the 90U building.

## save figures ----
# figs 1 & 3 don't work as a pdf for some reason
ggsave('fig/figure_1.png', figure_1, device = 'png', width = 7, height = 7) 
ggsave('fig/figure_2.pdf', figure_2, device = 'pdf', height = 4, width = 4)
ggsave('fig/figure_2.png', figure_2, device = 'png', height = 4, width = 4)
ggsave('fig/figure_3.png', figure_3, device = 'png', height = 8, width = 12)


figure_sites_map

Map: Where are these collection sites? Pink markers represent waste-water collection sites; purple markers represent buildings where surface testing was performed. Only one building, Tabaret Hall, had both surface and waste-water testing. The locations of two waste-water sampling sites could not be ascertained and are not displayed on the map.

It remains unclear which buildings the ‘South Sampling Site’ and ‘North Sampling Site’ catchment areas are related to (not mapped).



Models

  • Hypothesis testing. HA: swab-PCR positivity, CO2, wifi, and WW signals are predictive of number of cases. H0: no relationship.
  • Method: Backwards selection of significant predictors, ANOVA
  • Formula: cases ~ swab_pcr + co2 + wifi + {university_ww} + regional_ww + (all interactions...).
  • Issue: only 10 / 50 biweekly obs are complete cases
    • cases counted only until Feb 2022 (~40% miss).
    • university_ww from Dec 2021 (~40% miss).
    • wifi occasional missing data.
  • Question: do we want standardized effects or simple effects? (I standardized because we are mostly interested in comparing different predictors within a model, as opposed to across different data sets.)
  • Question: do we want to consider interactions? (I did and didn’t)


# n=50 biweekly obs; uni_ww missing first half of period...
df_mod <- biweekly_data |> 
  janitor::clean_names() |> 
  mutate(across(-c(cases, biweek), scale)) # rescale

# only 10 complete observations, oof
df_mod_complete_obs <- df_mod |> na.omit()

# missing data
naniar::vis_miss(df_mod) +
  labs(subtitle = 'Aggregated TS Missingness')



Case count distribution

Cases counts seem to follow a negative binomial distribution more than Poisson.

# simulated curves
add_poisson_density_curve <- function(p) {
  p + geom_density(
    data = tibble(
      x = rpois(n = sum(!is.na(df_mod$cases)),
                lambda = mean(df_mod$cases, na.rm = T))
      ), 
    aes(x), color = 'blue', alpha = 0.25, size = 0.03
  )
}
add_nb_density_curve <- function(p) {
  p +     
    geom_density(
      data = tibble(x = rnbinom(
        size = 1,
        n = sum(!is.na(df_mod$cases)),
        mu = mean(df_mod$cases, na.rm = T),
      )), 
      aes(x), color = 'blue', alpha = 0.25, size = 0.03
    )
}
simulation_plot <- function(plt_fn, title){
  p <- df_mod |> ggplot()
  walk(seq(100), ~ {p <<- plt_fn(p)})
  p + geom_density(aes(cases)) + labs(x = 'Cases', subtitle = title)
}

# Poisson dist
p1 <- simulation_plot(
  add_poisson_density_curve,
  'Cases vs. Simulated poisson distribution'
  )

# NB dist
p2 <- simulation_plot(
  add_nb_density_curve,
  'Cases vs. Simulated negative binomial distribution'
  )

(p1 / p2)

rm(p1, p2, add_poisson_density_curve, add_nb_density_curve)



Poisson Regression, Backwards Selection


Poisson Model with all interactions

Backwards selection arrived at the model with swab_pcr, wifi, ottawa_ww, and the interaction between ottawa_ww & swab_pcr.

# 30 complete obs when excluding university ww
df_mod_no_uniww <- df_mod |> select(-university_ww) |> na.omit()

# poisson model for cases with interactions
full_model <- glm(
  cases ~ . ^ 2,
  data = df_mod_no_uniww |> select(-biweek), 
  family = 'poisson')

# do backwards selection
backwards_selected <- step(full_model, trace = F)
stopifnot(backwards_selected$converged)

# standardized coefs and 95% CI
backwards_selected |> 
  broom::tidy(conf.int = T, conf.level = 0.95) |> 
  mutate(across(where(is.double), ~round(., 4))) |> 
  kableExtra::kable(
    format = 'pipe',
    caption = "Poisson model with interactions: Coefficients & 95% CI"
    )
Poisson model with interactions: Coefficients & 95% CI
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 1.1771 0.2318 5.0771 0.0000 0.6901 1.6033
swab_pcr 0.9139 0.2028 4.5056 0.0000 0.5191 1.3208
wi_fi 0.9444 0.3402 2.7761 0.0055 0.2810 1.6216
ottawa_ww 2.2294 0.5458 4.0847 0.0000 1.1979 3.3465
swab_pcr:ottawa_ww -0.5505 0.1853 -2.9705 0.0030 -0.9276 -0.1979
# use type III SS tests since swab*ww interaction is significant
car::Anova(backwards_selected, type = 3) |> 
  rownames_to_column(var = 'Term') |> 
  as_tibble() |> 
    kableExtra::kable(
    format = 'pipe',
    caption = "Type III ANOVA"
    )
Type III ANOVA
Term LR Chisq Df Pr(>Chisq)
swab_pcr 19.958024 1 0.0000079
wi_fi 7.782973 1 0.0052741
ottawa_ww 19.297456 1 0.0000112
swab_pcr:ottawa_ww 9.616743 1 0.0019281
# test model assumption of equidispersion
# p > 0.05 means no significant overdispersion
AER::dispersiontest(backwards_selected)
## 
##  Overdispersion test
## 
## data:  backwards_selected
## z = 0.51985, p-value = 0.3016
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##   1.120566
Backward selection model summary and diagnostics
summary(backwards_selected)
## 
## Call:
## glm(formula = cases ~ swab_pcr + wi_fi + ottawa_ww + swab_pcr:ottawa_ww, 
##     family = "poisson", data = select(df_mod_no_uniww, -biweek))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8448  -0.8735  -0.6446   0.3903   2.1520  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          1.1771     0.2318   5.077 3.83e-07 ***
## swab_pcr             0.9139     0.2028   4.506 6.62e-06 ***
## wi_fi                0.9444     0.3402   2.776  0.00550 ** 
## ottawa_ww            2.2294     0.5458   4.085 4.41e-05 ***
## swab_pcr:ottawa_ww  -0.5505     0.1853  -2.971  0.00297 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 182.217  on 29  degrees of freedom
## Residual deviance:  30.924  on 25  degrees of freedom
## AIC: 90.491
## 
## Number of Fisher Scoring iterations: 5
plot(backwards_selected)

hist(backwards_selected$residuals)

rm(full_model)



Poisson Main Effects Only

Poisson model chosen by backwards selection with no interactions identifies swab positivity and Ottawa WW signal as significant predictors.

# backwards select with n=30, no university ww
me_model <- glm(
  cases ~ swab_pcr + co2 + wi_fi + ottawa_ww,
  data = df_mod_no_uniww, 
  family = 'poisson')

backwards_selected_me <- step(me_model, trace = F)
stopifnot(backwards_selected_me$converged)

# nearly significant overdispersion when only main effects included
AER::dispersiontest(backwards_selected_me)
## 
##  Overdispersion test
## 
## data:  backwards_selected_me
## z = 1.6019, p-value = 0.05459
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##   1.346469
# regression coefs and 95%CI
backwards_selected_me |> 
  broom::tidy(conf.int = T, conf.level = 0.95) |> 
  mutate(across(where(is.double), ~round(., 3))) |> 
  kableExtra::kable(
    format = 'pipe',
    caption = "Main Effects Only: Coefficients & 95% CI"
    )
Main Effects Only: Coefficients & 95% CI
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.561 0.164 3.427 0.001 0.221 0.865
swab_pcr 0.471 0.147 3.201 0.001 0.180 0.757
ottawa_ww 0.801 0.251 3.194 0.001 0.318 1.300
# type II SS test
car::Anova(backwards_selected_me, type = 2) |> 
  rownames_to_column(var = 'Term') |> 
  as_tibble() |> 
  kableExtra::kable(
    format = 'pipe',
    caption = "Type II ANOVA"
  )
Type II ANOVA
Term LR Chisq Df Pr(>Chisq)
swab_pcr 10.00096 1 0.0015646
ottawa_ww 10.80594 1 0.0010118
Backward selection model summary and diagnostics
summary(backwards_selected_me)
## 
## Call:
## glm(formula = cases ~ swab_pcr + ottawa_ww, family = "poisson", 
##     data = df_mod_no_uniww)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6786  -1.2634  -0.8722   0.5679   2.1224  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.5610     0.1637   3.427 0.000611 ***
## swab_pcr      0.4707     0.1471   3.201 0.001371 ** 
## ottawa_ww     0.8006     0.2506   3.194 0.001402 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 182.22  on 29  degrees of freedom
## Residual deviance:  41.42  on 27  degrees of freedom
## AIC: 96.987
## 
## Number of Fisher Scoring iterations: 5
plot(backwards_selected_me)

hist(backwards_selected_me$residuals)

rm(me_model)



NB model, backwards selection

NB model with main effects also identifies cases ~ swab + ottawa_ww as optimal model (as with Poisson model). NB fit didn’t converge when all interactions are included in the model.


Main effects, no interactions

# negative binomial regression; main effects only
full_model_nb <- MASS::glm.nb(
  cases ~ ., 
  data = df_mod_no_uniww |> select(-biweek),
  control = glm.control(maxit = 5000)
  )

backwards_selected_nb <- step(full_model_nb, trace = F)
stopifnot(backwards_selected_nb$converged)

backwards_selected_nb |> 
  broom::tidy(conf.int = T, conf.level = 0.95) |> 
  mutate(across(where(is.double), ~round(., 4))) |> 
  rownames_to_column(var = 'Term') |> 
  kableExtra::kable(
    format = 'pipe',
    caption = "NB model -no interactions: Coefficients & 95% CI"
    )
NB model -no interactions: Coefficients & 95% CI
Term term estimate std.error statistic p.value conf.low conf.high
1 (Intercept) 0.5514 0.1708 3.2276 0.0012 0.1989 0.8681
2 swab_pcr 0.4839 0.1621 2.9864 0.0028 0.1639 0.7990
3 ottawa_ww 0.8059 0.2713 2.9699 0.0030 0.2811 1.3443
Model summary and diagnostics
# not the worst fit, could be much better
summary(backwards_selected_nb)
## 
## Call:
## MASS::glm.nb(formula = cases ~ swab_pcr + ottawa_ww, data = select(df_mod_no_uniww, 
##     -biweek), control = glm.control(maxit = 5000), init.theta = 31.03709038, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4918  -1.2413  -0.8631   0.5150   2.0839  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   0.5514     0.1708   3.228  0.00125 **
## swab_pcr      0.4839     0.1621   2.986  0.00282 **
## ottawa_ww     0.8059     0.2713   2.970  0.00298 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(31.0371) family taken to be 1)
## 
##     Null deviance: 163.247  on 29  degrees of freedom
## Residual deviance:  38.936  on 27  degrees of freedom
## AIC: 98.879
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  31.0 
##           Std. Err.:  91.3 
## 
##  2 x log-likelihood:  -90.879
plot(backwards_selected_nb)

hist(backwards_selected_nb$residuals)

rm(full_model_nb)

The negative binomial model (with extra parameter \(\theta\)) does not significantly improve the model fit compared to the poisson model (p > 0.05) according to LRT.

library(lmtest)
# poisson vs. nb models with main effects only
lmtest::lrtest(backwards_selected_me, backwards_selected_nb)
## Likelihood ratio test
## 
## Model 1: cases ~ swab_pcr + ottawa_ww
## Model 2: cases ~ swab_pcr + ottawa_ww
##   #Df  LogLik Df  Chisq Pr(>Chisq)
## 1   3 -45.494                     
## 2   4 -45.440  1 0.1083     0.7421
NB fit fails with interactions
# swab pcr and 
full_model_nb_int <- MASS::glm.nb(
  cases ~ .*., 
  data = df_mod_no_uniww |> select(-biweek) |> na.omit(),
  control = glm.control(maxit = 15000)
  )
# do selection
try((backwards_selected_nb_int <- step(full_model_nb_int)))
## Start:  AIC=-234
## cases ~ (swab_pcr + co2 + wi_fi + ottawa_ww) * (swab_pcr + co2 + 
##     wi_fi + ottawa_ww)
## 
##                      Df Deviance     AIC
## - swab_pcr:wi_fi      1   11.247 -252.49
## - swab_pcr:co2        1   20.301 -243.44
## - co2:ottawa_ww       1   20.377 -243.36
## - wi_fi:ottawa_ww     1   21.188 -242.55
## - co2:wi_fi           1   27.794 -235.94
## <none>                    27.737 -234.00
## - swab_pcr:ottawa_ww  1   39.531 -224.21
## 
## Step:  AIC=98.26
## cases ~ swab_pcr + co2 + wi_fi + ottawa_ww + swab_pcr:co2 + swab_pcr:ottawa_ww + 
##     co2:wi_fi + co2:ottawa_ww + wi_fi:ottawa_ww
## 
## Call:  MASS::glm.nb(formula = cases ~ swab_pcr + co2 + wi_fi + ottawa_ww + 
##     swab_pcr:co2 + swab_pcr:ottawa_ww + co2:wi_fi + co2:ottawa_ww + 
##     wi_fi:ottawa_ww, data = na.omit(select(df_mod_no_uniww, -biweek)), 
##     control = glm.control(maxit = 15000), init.theta = 261853298.3, 
##     link = log)
## 
## Coefficients:
##        (Intercept)            swab_pcr                 co2               wi_fi  
##            2.16837             1.28306             0.10333             1.64716  
##          ottawa_ww        swab_pcr:co2  swab_pcr:ottawa_ww           co2:wi_fi  
##            3.81150             0.43175            -0.43881             0.08261  
##      co2:ottawa_ww     wi_fi:ottawa_ww  
##           -0.56701             1.09738  
## 
## Degrees of Freedom: 29 Total (i.e. Null);  20 Residual
## Null Deviance:       182.2 
## Residual Deviance: 28.69     AIC: 100.3
rm(full_model_nb_int)



Model with university WW

If we consider the ten biweekly observations where we have linked cases, swabs, and university WW data,

# backwards select with university ww (n=10 biweekly obs)
full_model_w_uni_ww <- glm(
  cases ~ .,
  data = df_mod_complete_obs |> select(-biweek), 
  family = 'poisson')

# do backwards selection
backwards_selected_ww <- step(full_model_w_uni_ww, trace = F)
stopifnot(backwards_selected_ww$converged)


# standardized coefs and 95% CI
backwards_selected_ww |> 
  broom::tidy(conf.int = T, conf.level = 0.95) |> 
  mutate(across(where(is.double), ~round(., 4))) |> 
  kableExtra::kable(
    format = 'pipe',
    caption = "Poisson model with interactions: Coefficients & 95% CI"
    )
Poisson model with interactions: Coefficients & 95% CI
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.8023 0.1952 4.1098 0.0000 0.3916 1.1602
swab_pcr 0.4296 0.1703 2.5222 0.0117 0.0922 0.7620
wi_fi 0.6311 0.2910 2.1685 0.0301 0.0640 1.2064
university_ww 0.1950 0.0727 2.6827 0.0073 0.0487 0.3374
ottawa_ww 1.5437 0.4622 3.3396 0.0008 0.7143 2.5296
# use type II SS test
car::Anova(backwards_selected_ww, type = 2) |> 
  rownames_to_column(var = 'Term') |> 
  as_tibble() |> 
    kableExtra::kable(
    format = 'pipe',
    caption = "Type II ANOVA"
    )
Type II ANOVA
Term LR Chisq Df Pr(>Chisq)
swab_pcr 6.184606 1 0.0128866
wi_fi 4.760989 1 0.0291118
university_ww 6.635196 1 0.0099983
ottawa_ww 15.792780 1 0.0000707
# test model assumption of equidispersion
# p > 0.05 means no significant overdispersion
AER::dispersiontest(backwards_selected_ww)
## 
##  Overdispersion test
## 
## data:  backwards_selected_ww
## z = 0.5552, p-value = 0.2894
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##   1.111954
Model summary and diagnostics
summary(backwards_selected_ww)
## 
## Call:
## glm(formula = cases ~ swab_pcr + wi_fi + university_ww + ottawa_ww, 
##     family = "poisson", data = select(df_mod_complete_obs, -biweek))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7404  -1.1126  -0.2324   0.5563   2.0642  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    0.80226    0.19521   4.110 3.96e-05 ***
## swab_pcr       0.42957    0.17032   2.522 0.011664 *  
## wi_fi          0.63114    0.29104   2.169 0.030118 *  
## university_ww  0.19496    0.07267   2.683 0.007303 ** 
## ottawa_ww      1.54375    0.46225   3.340 0.000839 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 176.454  on 28  degrees of freedom
## Residual deviance:  31.953  on 24  degrees of freedom
## AIC: 91.52
## 
## Number of Fisher Scoring iterations: 5
plot(backwards_selected_ww)

hist(backwards_selected_ww$residuals)

rm(full_model_w_uni_ww)

GLMM: High v Low Traffic

swab_summary_location_traffic |> 
  set_names(c('Traffic level', 'N', 'PCR-Positive', '%')) |> 
  kableExtra::kable(
    format = 'pipe',
    caption = 'Positivity by sample location traffic-level'
  )
Positivity by sample location traffic-level
Traffic level N PCR-Positive %
high traffic 280 40 14.3
low traffic 274 32 11.7

To evaluate whether high traffic locations are different from low traffic sites, in terms of swab-PCR positivity, we fit a mixed model with random intercepts for different sites. There appears to be little difference (15% vs 12% overall) and this is confirmed by our mixed model with traffic level as a fixed effect (OR 0.71; 95% CI 0.82-2.2).

traffic_swabs <- 
  swabs_tidy |> 
  mutate(
    traffic = get_traffic_level(location) |> 
      str_remove('\\straffic') |> 
      factor(levels = c('low', 'high'))
    ) |> 
  select(site, traffic, pcr_positive)

mixed_model <- 
  lme4::glmer(pcr_positive ~ traffic + (1 | site), data = traffic_swabs,
            family = 'binomial')

mixed_model |> 
  broom.mixed::tidy(conf.int = T, exponentiate = T) |> 
  mutate(across(where(is.double), ~round(., 4))) |> 
  kableExtra::kable(
    format = 'pipe',
    caption = 'Traffic level - Random intercepts model'
  )
Traffic level - Random intercepts model
effect group term estimate std.error statistic p.value conf.low conf.high
fixed NA (Intercept) 0.1060 0.0377 -6.3105 0.0000 0.0528 0.2128
fixed NA traffichigh 1.2800 0.3341 0.9457 0.3443 0.7674 2.1349
ran_pars site sd__(Intercept) 0.7082 NA NA NA NA NA

Model: Cases w/in buildings

study_period_weeks <- crossing(
  date = as_date(as_date('2021-09-21'):as_date('2022-01-31')),
  site = c('90U', 'DMS', 'LPR', 'MNT', 'MRT', 'TBT')
  ) |> 
  mutate(
    week = get_date_week(date),
    redate = week_to_date(week)) |> 
  distinct(site, week)

# group swabs by week & site
per_bldg_swabs <- 
  swabs_tidy |> 
  filter(date_swab < '2022-02-02') |> 
  mutate(
    site = str_remove(site, 'UO_'),
    week = get_date_week(date_swab) |> as.character(),
  ) |> 
  group_by(site, week) |> 
  summarise(
    positivity = sum(pcr_positive == 'Positive') / n(),
    .groups = 'drop')

# 72 cases associated with study bldgs (at least one)
per_bldg_cases <- 
  cases |> 
  filter(case_date > '2021-09-20') |> 
  unnest(associated_sites) |> 
  pivot_longer(UC:TBT, names_to = 'site') |> 
  filter(value) |> 
  select(-value, -biweek, -case) |> 
  mutate(
    site = str_replace(site, 'UC', '90U'),
    week = get_date_week(case_date)) |> 
  group_by(week, site) |> 
  summarise(cases = n(), .groups = 'drop') 

per_bldg_df <- per_bldg_swabs |> 
  left_join(per_bldg_cases, by = join_by(site, week)) |> 
  mutate(cases = if_else(is.na(cases), 0, cases))

rm(study_period_weeks, per_bldg_cases, per_bldg_swabs)

Poisson GLM w random intercepts: Cases ~ positivity + (1|site)

per_bldg_model <- lme4::glmer(
  cases ~ positivity + (1|site),
  data = per_bldg_df,
  family = 'poisson'
)
broom.mixed::glance(per_bldg_model)
## # A tibble: 1 × 7
##    nobs sigma logLik   AIC   BIC deviance df.residual
##   <int> <dbl>  <dbl> <dbl> <dbl>    <dbl>       <int>
## 1    90     1  -64.4  135.  142.     82.9          87
# Wald CIs
broom.mixed::tidy(per_bldg_model, conf.int = T, conf.level = 0.95)
## # A tibble: 3 × 9
##   effect   group term          estim…¹ std.e…² stati…³   p.value conf.…⁴ conf.…⁵
##   <chr>    <chr> <chr>           <dbl>   <dbl>   <dbl>     <dbl>   <dbl>   <dbl>
## 1 fixed    <NA>  (Intercept)    -1.99    0.397   -5.01  5.38e- 7   -2.77   -1.21
## 2 fixed    <NA>  positivity      3.20    0.496    6.47  1.00e-10    2.23    4.18
## 3 ran_pars site  sd__(Interce…   0.622  NA       NA    NA          NA      NA   
## # … with abbreviated variable names ¹​estimate, ²​std.error, ³​statistic,
## #   ⁴​conf.low, ⁵​conf.high
points_df <- crossing(
  positivity = seq(0, 1, 0.1),
  site = unique(per_bldg_df$site) 
)

points_df |> 
  mutate(   
    pred = predict(
      type = 'response',
      object = per_bldg_model, 
      newdata = points_df
    )) |> 
  ggplot(aes(positivity, pred, color = site)) +
  geom_smooth(se = F, linewidth = 0.25, alpha = 1) + 
  geom_jitter(data = per_bldg_df,
              alpha = 0.75, shape = 1, size = 0.85,
              width = 0.01, height = 0.01,
              aes(positivity, cases)) +
  labs(x = 'Swab positivity', y = 'Cases', color = 'Site')

broom.mixed::tidy(per_bldg_model, 'ran_vals', 
                  conf.int = T, conf.level = 0.95) |> 
  ggplot(aes(y = level,
             x = estimate, xmin = conf.low, xmax = conf.high)) +
  geom_pointrange()


Case Data Imputation…

Multiple imputation (mice) has a key assumption that data is MAR. But data is (arguably) not MAR, but because of some dynamics not measured in the data (like the university giving up hope on recording the data in 2022).

Using mice means making the analysis much more complicated… Generally, you need to model each imputed dataset and then analyze the pooled results. That gets a lot more complicated when needing to first aggregate the data, join to a bunch of other data, do various modelling tasks…. all nested within each iteration.

Instead, I used a simpler ad-hoc method:

  • if positive-test-result date is available, use that, else:
  • use -5 days from end-of-isolation date, or if not available,
  • use +3 days from symptoms-began date

It’s not really the most sound approach, but the data is a bit questionable to begin with anyway…


R Software details
  • Statistical analysis was performed using R version 4.2.2 (2022-10-31; cite Ihaka & Gentleman).

  • Generalized linear models were created using the glm function. Mixed models were created using the glmer function from the package lme4 (Bates et al.).

  • We fit poisson and negative binomial regression models with the number of campus-wide cases as the outcome; we applied backwards selection to evaluate predictors of campus-wide cases, including the aggregated results (means) of surface swabbing, waste-water testing, CO2 monitors, and Wi-Fi user counts.

  • We used mixed effects logistic regression to model the presence of SARS-CoV-2 infected individuals as a binary outcome (ie. positive event: one or more cases occurring during a week) in a university building, using surface swab PCR results as predictor with random intercepts for buildings.

  • Graphics were created with ggplot2 (v3.4.1)

  • Find our analysis code at our public github repository: https://github.com/CUBE-Ontario/UOttawa-Analysis

sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.2.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices datasets  utils     methods   base     
## 
## other attached packages:
##  [1] lmtest_0.9-40     zoo_1.8-11        rcartocolor_2.0.0 glue_1.6.2       
##  [5] janitor_2.1.0     ggtext_0.1.2      patchwork_1.1.2   ggiraph_0.8.6    
##  [9] here_1.0.1        lubridate_1.9.2   forcats_1.0.0     stringr_1.5.0    
## [13] dplyr_1.1.0       purrr_1.0.1       readr_2.1.4       tidyr_1.3.0      
## [17] tibble_3.1.8      ggplot2_3.4.1     tidyverse_2.0.0  
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.4.2            uuid_1.1-0              backports_1.4.1        
##   [4] systemfonts_1.0.4       splines_4.2.2           listenv_0.8.0          
##   [7] crosstalk_1.2.0         leaflet_2.1.1           digest_0.6.29          
##  [10] foreach_1.5.2           ca_0.71.1               htmltools_0.5.4        
##  [13] leaflet.providers_1.9.0 fansi_1.0.3             magrittr_2.0.3         
##  [16] memoise_2.0.1           tzdb_0.3.0              globals_0.15.0         
##  [19] extrafont_0.19          vroom_1.6.1             sandwich_3.0-2         
##  [22] extrafontdb_1.0         svglite_2.1.0           timechange_0.2.0       
##  [25] gfonts_0.2.0            colorspace_2.0-3        rvest_1.0.3            
##  [28] textshaping_0.3.6       xfun_0.31               crayon_1.5.1           
##  [31] jsonlite_1.8.4          AER_1.2-10              lme4_1.1-31            
##  [34] survival_3.4-0          iterators_1.0.14        kableExtra_1.3.4       
##  [37] registry_0.5-1          gtable_0.3.0            webshot_0.5.4          
##  [40] car_3.1-1               Rttf2pt1_1.3.12         abind_1.4-5            
##  [43] scales_1.2.0            fontquiver_0.2.1        Rcpp_1.0.8.3           
##  [46] viridisLite_0.4.0       xtable_1.8-4            gridtext_0.1.5         
##  [49] bit_4.0.4               Formula_1.2-4           fontLiberation_0.1.0   
##  [52] htmlwidgets_1.5.4       httr_1.4.5              ellipsis_0.3.2         
##  [55] pkgconfig_2.0.3         farver_2.1.0            sass_0.4.1             
##  [58] utf8_1.2.2              crul_1.3                tidyselect_1.2.0       
##  [61] labeling_0.4.2          rlang_1.0.6             later_1.3.0            
##  [64] munsell_0.5.0           cellranger_1.1.0        tools_4.2.2            
##  [67] cachem_1.0.7            cli_3.6.0               generics_0.1.2         
##  [70] corrr_0.4.4             broom_1.0.3             evaluate_0.15          
##  [73] fastmap_1.1.0           yaml_2.3.5              ragg_1.2.5             
##  [76] knitr_1.39              bit64_4.0.5             visdat_0.6.0           
##  [79] future_1.26.1           nlme_3.1-160            mime_0.12              
##  [82] xml2_1.3.3              compiler_4.2.2          rstudioapi_0.14        
##  [85] curl_4.3.2              bslib_0.3.1             stringi_1.7.6          
##  [88] highr_0.9               gdtools_0.3.1           hrbrthemes_0.8.0       
##  [91] lattice_0.20-45         Matrix_1.5-1            nloptr_2.0.3           
##  [94] commonmark_1.8.1        fontBitstreamVera_0.1.1 psych_2.2.9            
##  [97] markdown_1.4            vctrs_0.5.2             furrr_0.3.0            
## [100] pillar_1.8.1            lifecycle_1.0.3         jquerylib_0.1.4        
## [103] seriation_1.4.0         httpuv_1.6.9            R6_2.5.1               
## [106] promises_1.2.0.1        TSP_1.2-1               renv_0.14.0            
## [109] parallelly_1.31.1       codetools_0.2-18        boot_1.3-28            
## [112] MASS_7.3-58.1           rprojroot_2.0.3         withr_2.5.0            
## [115] httpcode_0.3.0          mnormt_2.1.1            broom.mixed_0.2.9.4    
## [118] naniar_1.0.0            mgcv_1.8-41             parallel_4.2.2         
## [121] hms_1.1.2               grid_4.2.2              minqa_1.2.4            
## [124] rmarkdown_2.14          snakecase_0.11.0        carData_3.0-5          
## [127] shiny_1.7.4